from multiprocessing import Process

from bs4 import BeautifulSoup
from pymongo import MongoClient

import os
import random
import requests
import sys
import time

# The crawler recurses once per followed link, so raise the interpreter's
# recursion limit far above the default (set to one million here).
sys.setrecursionlimit(1000000)

# connect=False defers the actual connection until first use, which keeps
# the client fork-safe when it is shared with the worker processes below.
client = MongoClient('127.0.0.1', 27017, connect=False)

baijiahao_db = client['baijiahao_db']
baijiahao_coll = baijiahao_db['baijiahao_coll']

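# Per-process record of visited URLs (a dict used as a set). Each worker
# process gets its own copy after fork, so the cap below applies per worker.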
data = {}


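# Depth-first crawl: scrape one page, store it if unseen, then recurse
# into every mbd.baidu.com link found on the page.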
def scrapy_post(url):
    # Stop this worker once it has visited enough pages.
    if len(data) > 1000:
        return

    data[url] = 1  # mark visited up front so link cycles are not re-fetched

    try:
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        time.sleep(1)  # back off briefly, then retry the same URL
        return scrapy_post(url)

    soup = BeautifulSoup(r.text, 'html.parser')  # name a parser explicitly

    # A page counts as an article only if it has both a body and a title.
    if soup.find(class_='article-content') and soup.find(class_='article-title'):
        source = soup.find(class_='article-source')
        t = source.get_text() if source else ''  # source/date line may be absent
        title = soup.find(class_='article-title').get_text().strip()
        article = soup.find(class_='article-content').get_text().strip()

        print('=' * 100)  # visual separator between articles in the log

        # Deduplicate by title. find-then-insert is racy across processes;
        # a unique index on 'title' would make this check atomic.
        if not baijiahao_coll.find_one({'title': title}):
            baijiahao_coll.insert_one({'title': title, 'article': article, 'time': t, 'url': url})

            print(title)
            print()
            print(article)
        else:
            print(title, 'already crawled')

    # Collect on-page links pointing back into mbd.baidu.com; skip anchors
    # without an href (a.get('href') can be None) and URLs already visited.
    links = [a.get('href') for a in soup.find_all('a')
             if a.get('href') and 'mbd.baidu.com' in a.get('href') and a.get('href') not in data]
    random.shuffle(links)  # randomize crawl order so parallel workers diverge

    for i in links:
        scrapy_post(i)


if __name__ == '__main__':
    proc = []

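    # Seed 20 workers, each starting from a random URL already stored in
    # the collection ($sample draws random documents).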
    for item in baijiahao_coll.aggregate([{'$sample': {'size': 20}}]):
        p = Process(target=scrapy_post, args=(item['url'],))
        p.start()
        proc.append(p)

    for p in proc:
        p.join()

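    # Once every crawler worker has exited, hand off to the next scraper.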
    os.system("python3 lagou_post_by_search.py")
